import os
import json
import torch
from openai import AzureOpenAI
from tqdm import tqdm
import sys
import re
import base64
import cv2
from PIL import Image
from io import BytesIO
import random
import pandas as pd
import numpy as np
import argparse

# ---- Command-line interface -------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument(
    '--count_personas',
    action='store_true',
    help='Output the number of personas',
)
parser.add_argument(
    '--start',
    type=int,
    default=0,
    help='Start index for dataset slicing',
)
parser.add_argument(
    '--end',
    type=int,
    default=None,
    help='End index for dataset slicing (inclusive)',
)
parser.add_argument(
    '--output_dir',
    type=str,
    default='results',
    help='Directory to save per-job JSON outputs',
)
# Repetition counts arrive as one comma-separated string and are split below.
parser.add_argument(
    '--runs_list',
    type=str,
    default='20,40,60,80,100',
    help='Comma-separated list indicating how many times to repeat the prediction for each datapoint (e.g., "20,40,60")',
)
args = parser.parse_args()

# Convert --runs_list to ints: blank tokens are skipped, non-positive counts
# are dropped, and a non-numeric token raises ValueError from int().
runs_list = []
for token in args.runs_list.split(','):
    if not token.strip():
        continue
    count = int(token)
    if count > 0:
        runs_list.append(count)
if not runs_list:
    raise ValueError("--runs_list must contain at least one positive integer")

# Resolve the output directory to an absolute path, create it if needed, and
# store the absolute form back on args so both names agree.
output_dir = os.path.abspath(args.output_dir)
os.makedirs(output_dir, exist_ok=True)
args.output_dir = output_dir


# NOTE: the per-persona prompts below are disabled (kept for reference only); GENERIC_SYSTEM_PROMPT is used instead.
# persona_prompts = {
#     "18-24_female": """You are a woman aged 18–24. You're fluent in digital aesthetics, raised on platforms like TikTok and Instagram. You notice instantly if something has a vibe—bold colors, expressive fonts, emotional tone, or modern, fun design. Websites that are cluttered, generic, or try-hard are less likely to appeal to you.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its visual design, layout, color scheme, and content.

# Return:
# Reason: [Why this website does or doesn't appeal to you visually and emotionally]
# Answer: [0–10] ← You must include this score.""",

#     "25-34_male": """You are a man aged 25–34. You value strong, clear, and modern visuals. You're likely to appreciate websites that are bold but not messy—clean grids, high contrast, sharp fonts, and relevant content (fitness, tech, ambition, money).

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its layout, visual punch, and message.

# Return:
# Reason: [Why this design works for you—or not]
# Answer: [0–10] ← You must include this score.""",

#     "35-44_female": """You are a woman aged 35–44. You're drawn to websites that are intentional, emotionally intelligent, and visually clean. Family, meaning, and beauty in simplicity appeal to you more than trend-driven clutter.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its design, clarity, and emotional tone.

# Return:
# Reason: [Describe what you liked—or didn't—in terms of layout, tone, and aesthetic]
# Answer: [0–10] ← You must include this score.""",

#     "45-54_male": """You are a man aged 45–54. You prefer websites that are easy to navigate, focused, and visually grounded. You're drawn to sites that reflect purpose and clarity over trend or flash.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on usability, structure, and message.

# Return:
# Reason: [What stood out to you—positively or negatively—in its design or layout]
# Answer: [0–10] ← You must include this score.""",

#     "55+_female": """You are a woman aged 55 or older. You appreciate websites that feel meaningful, visually calm, and easy to understand. Gentle color palettes, clear fonts, and emotionally warm content make a big difference.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on design simplicity and emotional tone.

# Return:
# Reason: [How the design made you feel—comforted, confused, interested, or indifferent]
# Answer: [0–10] ← You must include this score.""",
# }

# System prompt shared by every request: one instruction paragraph followed by
# the two-line reply format ("Answer:" first, then "Reason:").
GENERIC_SYSTEM_PROMPT = "\n".join(
    [
        "You are an evaluator of website aesthetics. You will be shown 5 example website screenshots with their likeability scores (0-10 scale), followed by a target website to evaluate. "
        "Your task is to judge how much you like the last website based on its visual design, layout, color scheme, and content. "
        "Return your response in this exact format:",
        "Answer: [0–10]",
        "Reason: [Briefly explain your score in minimal words]",
    ]
)


def frame_to_data_url(frame_bgr):
    """Encode an OpenCV BGR frame as a base64 JPEG data URL.

    The frame is converted from BGR to RGB, resized to 256x256 with LANCZOS
    resampling (the aspect ratio is not preserved), JPEG-encoded in memory and
    returned as a ``data:image/jpeg;base64,...`` string.

    Args:
        frame_bgr: Image array in OpenCV's BGR channel order.

    Returns:
        The data URL string.

    Raises:
        ValueError: If ``frame_bgr`` is None. ``cv2.imread`` returns None when
            a file cannot be read, so this turns an otherwise cryptic OpenCV
            error into an actionable message.
    """
    if frame_bgr is None:
        raise ValueError(
            "frame_bgr is None; the image could not be read (missing or unreadable file?)"
        )

    # PIL expects RGB channel order, OpenCV delivers BGR.
    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    image = Image.fromarray(frame_rgb).resize((256, 256), Image.LANCZOS)

    # Encode to JPEG in memory; getvalue() returns the whole buffer without
    # needing a seek/read round-trip.
    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    base64_encoded_data = base64.b64encode(buffered.getvalue()).decode('utf-8')

    return f"data:image/jpeg;base64,{base64_encoded_data}"

# Azure OpenAI Configuration
# Credentials are taken from the environment when available so that real keys
# never have to be committed to the file; the original placeholders remain as
# fallbacks, so behaviour is unchanged when the variables are unset or empty.
api_version = "2024-02-15-preview"
config_dict = {
    'api_key': os.environ.get("AZURE_OPENAI_API_KEY") or "YOUR_OPENAI_API_KEY",
    'api_version': api_version,
    'azure_endpoint': os.environ.get("AZURE_OPENAI_ENDPOINT") or "https://your-azure-openai-endpoint/",
}

def create_persona_system_prompt(persona_specification):
    """Build a persona-flavoured system prompt from a persona description.

    The text up to the first period (re-terminated with '.') is used as a
    short "You are ..." opener; the full specification follows, then the
    scoring task and the required "Reason:" / "Answer:" reply format.
    Every line after the first carries a four-space indent.
    """
    # Everything before the first '.' (the whole text if there is none).
    lead, _, _ = persona_specification.partition('.')
    opener = lead + '.'

    indent = " " * 4
    indented_lines = [
        "",
        persona_specification,
        "",
        "Your task is to judge how much you **like** this website based on its visual design, layout, color scheme, and content, considering your unique background, personality, and preferences.",
        "",
        "You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot.",
        "",
        "Return:",
        "Reason: [Explain your reaction based on your background and preferences]",
        "Answer: [0–10] ← You must include this score.",
    ]
    return "\n".join([f"You are {opener}"] + [indent + line for line in indented_lines])
    
    # DISABLED ALTERNATIVE (kept for reference): enhanced system prompt with chain-of-thought and structured evaluation
#     return f"""You are {first_sentence}

# {persona_specification}

# You are an expert website aesthetic evaluator. You will be shown 5 example websites with their likeability scores (0-10 scale), followed by a target website to evaluate.

# EVALUATION METHODOLOGY:
# Use this step-by-step chain-of-thought approach to systematically evaluate the target website:

# STEP 1 - FIRST IMPRESSION ANALYSIS:
# - What is your immediate emotional reaction to this website?
# - Does it feel modern, professional, outdated, cluttered, or clean?
# - How does it compare to current web design trends?

# STEP 2 - TECHNICAL DESIGN ASSESSMENT:
# - Visual Hierarchy: How well does the layout guide your eye? (Rate 1-10)
# - Color Harmony: How pleasing and cohesive is the color scheme? (Rate 1-10)
# - Typography: How readable and aesthetically pleasing are the fonts? (Rate 1-10)
# - Layout Balance: How well-balanced and organized is the content? (Rate 1-10)
# - Visual Appeal: How attractive is the overall design? (Rate 1-10)

# STEP 3 - CONTEXTUAL COMPARISON:
# - Compare this website to the 5 examples you were shown
# - Which example website is it most similar to in quality?
# - Is it better or worse than that example, and by how much?

# STEP 4 - PERSONAL PREFERENCE INTEGRATION:
# - Based on your background and expertise described above, how does this align with your aesthetic preferences?
# - What specific elements appeal to or displease you personally?

# STEP 5 - FINAL SYNTHESIS:
# - Average your technical scores from Step 2
# - Adjust based on your personal preferences (+/- 1-2 points)
# - Consider the comparative context from Step 3
# - Provide your final score with decimal precision (e.g., 7.3, 8.7)

# You can provide precise scores including decimal values to reflect nuanced judgment.

# REQUIRED OUTPUT FORMAT:
# First Impression: [Your immediate reaction]
# Technical Analysis: [Brief analysis with sub-scores for each dimension]
# Comparison: [How it compares to the examples]
# Personal Perspective: [Your unique viewpoint based on your background]
# Final Reasoning: [Synthesis of all factors]
# Answer: [Your final score 0-10 with decimals] ← You must include this numerical score."""

def get_json_data_generate(sys_prompt, user_prompt, images):
    """Build the chat-completions payload for one few-shot scoring request.

    Args:
        sys_prompt: Text of the system message.
        user_prompt: Instruction text placed first in the user message.
        images: List of ``(data_url, score)`` tuples. Every entry except the
            last is a scored example; the last entry is the image to be
            scored and its score is ignored.

    Returns:
        Dict with a ``"messages"`` list: the system message followed by a
        single multi-part user message.
    """
    user_content = [{"type": "text", "text": user_prompt}]
    last = len(images) - 1
    for idx, (img_url, score) in enumerate(images):
        if idx < last:
            # An image content part only carries "type" and "image_url", so a
            # score stored as an extra key on it never reaches the model.
            # Send it as a text part directly before the image instead.
            label = f"Example {idx + 1}"
            if score is not None:
                label += f" (likeability score: {score:.2f})"
            user_content.append({"type": "text", "text": label + ":"})
            user_content.append({
                "type": "image_url",
                "image_url": {"url": img_url, "detail": "low"},
            })
        else:
            # The image to be scored is sent at high detail.
            user_content.append({"type": "text", "text": "Website to score:"})
            user_content.append({
                "type": "image_url",
                "image_url": {"url": img_url, "detail": "high"},
            })
    return {
        "messages": [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_content},
        ]
    }

# Shared client instance, created on first use by _get_client().
_client = None


def _get_client():
    """Return the shared AzureOpenAI client, creating it on the first call."""
    global _client
    if _client is None:
        _client = AzureOpenAI(
            api_key=config_dict['api_key'],
            api_version=config_dict['api_version'],
            azure_endpoint=config_dict['azure_endpoint'],
        )
    return _client


def verbalize(prompt, sys_prompt, images):
    """Send one scoring request to the model and return its text reply.

    Args:
        prompt: User instruction text.
        sys_prompt: System prompt text.
        images: List of ``(data_url, score)`` tuples, passed through to
            ``get_json_data_generate``.

    Returns:
        The first choice's message content with surrounding whitespace removed.

    Raises:
        ValueError: If the reply has no text content (``message.content`` is
            None), instead of failing with an opaque AttributeError.
        Any exception raised by the OpenAI client is propagated unchanged.
    """
    json_data = get_json_data_generate(sys_prompt, prompt, images)
    # Reuse one client for all calls; this function runs once per repetition,
    # so building a new client each time is wasted work.
    response = _get_client().chat.completions.create(
        model='gpt-4o',
        messages=json_data["messages"],
        max_tokens=350,
        temperature=0.85,
        n=1
    )
    choice = response.choices[0]
    content = choice.message.content
    if content is None:
        raise ValueError(
            f"Model returned no text content (finish_reason={choice.finish_reason})"
        )
    return content.strip()

# NOTE: disabled per-persona prompts (a duplicate of the commented-out block near the top of the file); GENERIC_SYSTEM_PROMPT is used instead.
# persona_prompts = {
#     "18-24_female": """You are a woman aged 18–24. You're fluent in digital aesthetics, raised on platforms like TikTok and Instagram. You notice instantly if something has a vibe—bold colors, expressive fonts, emotional tone, or modern, fun design. Websites that are cluttered, generic, or try-hard are less likely to appeal to you.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its visual design, layout, color scheme, and content.

# Return:
# Reason: [Why this website does or doesn't appeal to you visually and emotionally]
# Answer: [0–10] ← You must include this score.""",

#     "25-34_male": """You are a man aged 25–34. You value strong, clear, and modern visuals. You're likely to appreciate websites that are bold but not messy—clean grids, high contrast, sharp fonts, and relevant content (fitness, tech, ambition, money).

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its layout, visual punch, and message.

# Return:
# Reason: [Why this design works for you—or not]
# Answer: [0–10] ← You must include this score.""",

#     "35-44_female": """You are a woman aged 35–44. You're drawn to websites that are intentional, emotionally intelligent, and visually clean. Family, meaning, and beauty in simplicity appeal to you more than trend-driven clutter.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on its design, clarity, and emotional tone.

# Return:
# Reason: [Describe what you liked—or didn't—in terms of layout, tone, and aesthetic]
# Answer: [0–10] ← You must include this score.""",

#     "45-54_male": """You are a man aged 45–54. You prefer websites that are easy to navigate, focused, and visually grounded. You're drawn to sites that reflect purpose and clarity over trend or flash.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on usability, structure, and message.

# Return:
# Reason: [What stood out to you—positively or negatively—in its design or layout]
# Answer: [0–10] ← You must include this score.""",

#     "55+_female": """You are a woman aged 55 or older. You appreciate websites that feel meaningful, visually calm, and easy to understand. Gentle color palettes, clear fonts, and emotionally warm content make a big difference.

# You are given 5 example website screenshots and how much everyone liked them (on a 0–10 scale). You're now shown a new website screenshot. Your task is to judge how much you **like** this website based on design simplicity and emotional tone.

# Return:
# Reason: [How the design made you feel—comforted, confused, interested, or indifferent]
# Answer: [0–10] ← You must include this score.""",
# }

# Info for the user
print(f"Using generic system prompt for all predictions. Will run repetitions per datapoint for counts: {runs_list}")

# Load test data
test_filename = "/path/to/test_list.csv"
df = pd.read_csv(test_filename)

# Determine slice (both bounds inclusive). Bounds are clamped to the rows that
# actually exist: an --end past the last row or a negative --start would
# otherwise only produce one "Error processing row" message per bad index.
last_row = df.shape[0] - 1
start_idx = max(args.start, 0)
end_idx = last_row if args.end is None else min(args.end, last_row)
if start_idx != args.start or (args.end is not None and end_idx != args.end):
    print(f"Note: requested slice [{args.start}, {args.end}] clamped to [{start_idx}, {end_idx}]")


# =========================
# MAIN EVALUATION LOOP
# =========================
from functools import lru_cache

# Directory holding the dataset screenshots.
IMAGE_DIR = "/path/to/website_aesthetics_datasets/rating-based-dataset/images/"

# "Answer:" followed by optional whitespace, markdown asterisks or an opening
# bracket, then a number. The second group catches a trailing "-N" / "–N" so an
# echoed template such as "Answer: [0–10]" is not mistaken for a score of 0.
_ANSWER_RE = re.compile(r"Answer:[\s*\[]*(\d+(?:\.\d+)?)(\s*[–-]\s*\d)?")
_REASON_RE = re.compile(r"Reason:\s*(.*)")


@lru_cache(maxsize=1024)
def _image_data_url(fname):
    """Return the data URL for a dataset image, reusing recently encoded files."""
    path = IMAGE_DIR + fname.replace("_resized", "")
    frame = cv2.imread(path)
    if frame is None:
        # cv2.imread signals an unreadable file by returning None.
        raise FileNotFoundError(f"Could not read image: {path}")
    return frame_to_data_url(frame)


def _parse_response(resp):
    """Extract (score, reason) from a model reply.

    The score is the last "Answer:" value that lies within 0-10 and is not
    part of a range; it is None when no such value exists. The reason is the
    rest of the first "Reason:" line, or the whole reply if there is none.
    """
    score = None
    for match in _ANSWER_RE.finditer(resp):
        if match.group(2):
            continue  # a range such as "0–10", i.e. the echoed format template
        candidate = float(match.group(1))
        if 0.0 <= candidate <= 10.0:
            score = candidate
    reason_match = _REASON_RE.search(resp)
    reason = reason_match.group(1).strip() if reason_match else resp
    return score, reason


def _write_results(path, results):
    """Write results as JSON via a temp file and rename.

    An interrupted run then never leaves a truncated results file behind.
    """
    tmp_path = path + ".tmp"
    with open(tmp_path, "w") as f_out:
        json.dump(results, f_out, indent=2)
    os.replace(tmp_path, path)


# Progress bars use distinct positions (0: run counts, 1: datapoints,
# 2: repetitions) so they do not overwrite each other.
for n_runs in tqdm(runs_list, desc="Run counts", position=0):
    print("\n" + "=" * 80)
    print(f"Running evaluation with {n_runs} repetitions per datapoint...")
    print("=" * 80)

    response_dict = []

    # Predefine output path so we can incrementally flush results
    output_file = os.path.join(
        output_dir,
        f"forecast_results_runs{n_runs}_{args.start}_{args.end if args.end is not None else 'end'}.json",
    )

    for i in tqdm(range(start_idx, end_idx + 1), desc="Datapoints", position=1, leave=False):
        try:
            d = df.iloc[i]
            value = d.to_dict()

            # Prepare the target image once for all repetitions (it's the same)
            image_url = _image_data_url(d["image"])

            # Few-shot candidates: every row except the target. This does not
            # change between repetitions, so build it once per datapoint.
            other_indices = list(range(df.shape[0]))
            other_indices.remove(i)

            predictions = []
            reasons = []

            for _ in tqdm(range(n_runs), desc=f"Reps x{n_runs}", position=2, leave=False):
                # Sampling, prompt building and the model call share one
                # try-block, so a single unreadable example image costs one
                # repetition rather than the whole datapoint.
                try:
                    # --- Few-shot sampling (5 random examples) ---
                    sample_indices = random.sample(other_indices, min(5, len(other_indices)))

                    example_lines = []
                    example_images = []
                    for idx in sample_indices:
                        row = df.iloc[idx]
                        score = row["mean_score"]
                        example_lines.append(f"Score: {score:.1f}")
                        example_images.append((_image_data_url(row["image"]), score))

                    # Add the current image as the last one
                    example_images.append((image_url, None))
                    examples_text = "\n".join(example_lines)

                    # Create the user prompt
                    prompt = (
                        "Given the images below, the first 5 are example website screenshots "
                        "with their likeability scores (on a 0–10 scale, see the list below). "
                        "The last image is the one you should score. Carefully consider the last "
                        "image and give a score between 0 to 10 based on how much you like the "
                        "website's visual design, layout, colors, and content.\n\nHere are 5 "
                        f"example likeability scores (in order):\n{examples_text}"
                    )

                    # --- Call the model ---
                    resp = verbalize(prompt, GENERIC_SYSTEM_PROMPT, example_images)
                    pred_value, pred_reason = _parse_response(resp)
                except Exception as e:
                    print(f"Error during repetition with generic prompt: {e}")
                    pred_value = None
                    pred_reason = f"Error: {str(e)}"

                if pred_value is not None:
                    predictions.append(pred_value)
                reasons.append(pred_reason)

            # After repetitions, compute mean prediction
            mean_prediction = float(np.mean(predictions)) if predictions else None

            # Store aggregated results for this datapoint
            value.update(
                {
                    "predictions": predictions,  # list of raw predictions (may be fewer than n_runs if errors)
                    "reasons": reasons,  # one entry per repetition: parsed reason, raw reply, or error text
                    "mean_prediction": mean_prediction,
                    # Cast to a plain float so a numpy scalar cannot break json.dump.
                    "ground_truth": float(d["mean_score"]),
                }
            )
            response_dict.append(value)

            # Incrementally flush results to disk to avoid losing progress
            try:
                _write_results(output_file, response_dict)
            except Exception as e:
                print(f"Warning: Failed to write incremental JSON to {output_file}: {e}")

        except Exception as e:
            print(f"Error processing row {i}: {e}")
            continue

    # Final write: usually redundant with the incremental flush, but it
    # guarantees the output file exists even when no datapoint succeeded.
    try:
        _write_results(output_file, response_dict)
    except Exception as e:
        print(f"Error writing final JSON to {output_file}: {e}")

    print(f"Evaluation with {n_runs} runs completed. Results saved to {output_file}")